Filter criteria:

# Summary of the filtered dataset: gene count, sample count and the
# ASD/control split taken from datMeta$Diagnosis_
glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

The first principal component alone explains 89.7% of the total variance (just under the 90% mark…)

# Reduce the dimensionality of the expression matrix with PCA, keeping the
# smallest set of leading components whose cumulative variance reaches
# `var_explained`.
#
# Args:
#   datExpr:       expression matrix/data.frame (observations in rows)
#   datMeta:       metadata (unused here; kept for call-site compatibility)
#   var_explained: cumulative proportion of variance to retain (default 0.98)
#
# Returns a list with:
#   datExpr:    data.frame with the scores of the retained components
#   pca_output: the full prcomp object
#
# Side effects: draws scree + cumulative-variance plots and prints how many
# components were kept.
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  
  datExpr = data.frame(datExpr)
  
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  
  # Cumulative proportion of variance per component (3rd row of importance)
  cum_var = summary(datExpr_pca)$importance[3,]
  # Index of the first component reaching the threshold.
  # BUG FIX: the previous implementation took top_n(-1, ID) over the PC
  # *names*, i.e. a lexicographic minimum, which is wrong with >= 10
  # components because e.g. 'PC10' sorts before 'PC9'.
  n_components = which(cum_var >= var_explained)[1]
  # If no component reaches the threshold (var_explained > max), keep all
  if(is.na(n_components)) n_components = length(cum_var)
  
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=n_components, col='blue')   # numeric position (was a character)
  plot(cum_var, type='b')
  abline(h=var_explained, col='blue')
  
  print(glue('Keeping top ', n_components, ' components that explain ',
             var_explained*100, '% of the variance'))
  
  # Keep the first n_components score columns by position, which is safer
  # than selecting by the PC1:PCn name range
  datExpr_top_pc = data.frame(datExpr_pca$x[, seq_len(n_components), drop=FALSE])
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

# Run the dimensionality reduction and unpack the results
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Keeping top 11 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output

# Free objects no longer needed (datExpr is replaced by datExpr_redDim)
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)

Perhaps the first component is determined by a small number of samples and by removing them we can lower the variance explained by it:

plot(sort(pca_output$rotation[,1], decreasing = TRUE))

Genes are still separated into two clouds of points:

datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()

Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.

# Reload the full normalized expression data to compute differential
# expression for every gene, so the PCA space can be coloured by DE statistics
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]

# Calculate differential expression p-value of each gene with limma,
# modelling repeated measures per subject via duplicateCorrelation
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)

# TRUE spelled out (T is reassignable and therefore unsafe)
fit = eBayes(lmfit, trend=TRUE, robust=TRUE)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
# Re-order the DE table to match the gene order of datExpr
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]

ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC

# Project every gene onto the PCA space fitted on the filtered data and
# colour by adjusted p-value and by log2 fold change
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() + theme_minimal() +
  scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)

rm(mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr, datProbes, datSeq, p1, p2)

Clustering

clusterings = list()

K-means Clustering

# Elbow method: total within-cluster sum of squares for k = 1..10
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
                                      algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
# Elbow chosen by visual inspection of the plot above
best_k = 3
abline(v = best_k, col='blue')

# NOTE(review): the final fit uses kmeans' default algorithm while the elbow
# search used 'MacQueen' — confirm this inconsistency is intended
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=6 as best number of clusters.

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.

# Hierarchical clustering on Euclidean distances (hclust defaults)
h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)

# Cut the dendrogram into the chosen number of clusters
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)

Consensus Clustering

Samples are grouped into two big clusters and two outliers; the first big cluster has three main subclusters, two small subclusters and two outliers, and the second one has two main subclusters and five small subclusters.

*Output plots in clustering_genes_03_20 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.01 using the fdrtool package

# Run ICA with as many components as retained PCs
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
# Keep only super-Gaussian signals (kurtosis > 3).
# BUG FIX: the previous code selected names(apply(...) > 3); names() of a
# named logical vector returns *every* name, so the kurtosis filter was a
# no-op. names(which(...)) keeps only the components passing the cutoff.
signals_w_kurtosis = ICA_output$S %>% data.frame %>%
  dplyr::select(all_of(names(which(apply(ICA_output$S, 2, kurtosis) > 3))))
# Gene belongs to a component when its FDR q-value is significant
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=FALSE)$qval<0.01) %>% data.frame
# Drop components with at most one assigned gene
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

# Hard assignment: each gene takes the name of the (last) component it is
# significant in; NA when significant in none
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

Leaves most of the observations (~80%) without a cluster (every time it leaves out more genes):

# Distribution of how many components each gene belongs to
ICA_clusters %>% rowSums %>% table
## .
##    0    1    2    3    4    5 
## 2355  425  112   27    8    1
# Membership heatmap: genes vs ICA components
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

best_power=56 but blockwiseModules only accepts powers up to 30, so 30 was used instead

# Soft-threshold search for WGCNA (genes become columns after transposing)
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(20, 60, by=2))
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1     20    0.238 -0.225          0.940     516       517   1130
## 2     22    0.302 -0.248          0.933     478       463   1080
## 3     24    0.366 -0.272          0.942     446       417   1040
## 4     26    0.436 -0.293          0.958     417       376    998
## 5     28    0.485 -0.314          0.949     391       339    963
## 6     30    0.522 -0.330          0.945     368       309    931
## 7     32    0.578 -0.352          0.967     347       282    901
## 8     34    0.625 -0.371          0.965     328       258    873
## 9     36    0.645 -0.384          0.954     311       236    847
## 10    38    0.688 -0.402          0.946     295       215    822
## 11    40    0.715 -0.415          0.938     281       200    799
## 12    42    0.740 -0.426          0.935     267       185    778
## 13    44    0.763 -0.442          0.930     255       172    757
## 14    46    0.787 -0.457          0.936     244       159    738
## 15    48    0.802 -0.469          0.933     233       148    719
## 16    50    0.809 -0.480          0.923     223       138    702
## 17    52    0.823 -0.493          0.924     214       128    685
## 18    54    0.839 -0.503          0.929     206       119    669
## 19    56    0.854 -0.515          0.933     198       111    654
## 20    58    0.858 -0.527          0.926     190       104    640
## 21    60    0.872 -0.534          0.937     183        98    626
# power capped at 30: blockwiseModules does not accept larger values even
# though the scale-free fit keeps improving up to 56+ (see table above)
network = datExpr_redDim %>% t %>% blockwiseModules(power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It leaves 304 genes without a cluster:

# Module sizes; label 0 = genes not assigned to any module
clusterings[['WGCNA']] %>% table
## .
##    0    1    2    3    4 
##  304 1742  528  227  127

Gaussian Mixture Models with hard thresholding

Number of clusters that resemble more Gaussian mixtures = 39

# Choose the number of mixture components by BIC
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 39
gmm = datExpr_redDim %>% GMM(best_k)
# Hard assignment: each gene goes to the component with highest log-likelihood
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

# Append the centroids (with NA labels) so they are drawn in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Trying with 27 clusters (27 has the 9th lowest value)

best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
# Hard assignment by maximum log-likelihood, as before
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 159  71 159 165 109 122  96 106 135  78  54 109 154  90 146  71 102  40 
##  19  20  21  22  23  24  25  26  27 
## 160 159  99  87  55  98  82  67 155

Trying with 6 clusters

best_k = 6
gmm = datExpr_redDim %>% GMM(best_k)
# Hard assignment by maximum log-likelihood, as before
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_3']] %>% table
## .
##   1   2   3   4   5   6 
## 662 455 366 684 431 330

Manual clustering

Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.

# Split the two clouds with the line PC2 = -0.1*PC1 + 0.05: label 1 = genes
# below the line, label 0 = genes above it
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() + 
  geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()

names(manual_clusters) = rownames(datExpr_redDim)

clusterings[['Manual']] = manual_clusters

Cluster 2 has a bigger mean expression than cluster 1. There also seems to be more than one distribution in cluster 2, with two different means and three different standard deviations?

# Per-gene mean and sd of the retained PC scores, plus the manual cluster.
# NOTE: cbind() coerces the 0/1 factor to its integer codes, so clusters are
# relabelled 1/2 here (level '0' -> 1, level '1' -> 2) — this is why the
# surrounding text talks about clusters 1 and 2
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)

Separating cluster 2 genes by their sd:

# Standard deviations of the genes in (relabelled) cluster 2
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
# NOTE(review): this assumes manual_clusters_data kept meaningful rownames
# through the earlier mutate(); dplyr may have reset them — confirm
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
# Fit a 3-component Gaussian mixture to the sd values
gmm_c2_sd = c2_sd %>% GMM(3)

# Overlay the three fitted Gaussian components on the sd distribution.
# NOTE(review): covariance_matrices is passed directly as dnorm's `sd`
# argument — if it holds variances this should be sqrt(...); confirm against
# the ClusterR GMM documentation
manual_clusters_data %>% ggplot(aes(x=sd)) +
  stat_function(fun=dnorm, n=100, colour='#99cc00', # green
                args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour='#2eb8b8', # acqua
                args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour='#a64dff', # purple
                args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
  theme_minimal()

# Refine the manual clustering: split manual cluster 1 into subclusters
# '1_1'/'1_2'/'1_3' by each gene's most likely sd mixture component
clusterings[['Manual2']] = as.character(clusterings[['Manual']])
clusterings[['Manual2']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))

# Sizes of the refined manual clusters
clusterings[['Manual2']] %>% table
## .
##    0  1_1  1_2  1_3 
## 1713  422  436  357
# Clean up all intermediate objects from the clustering section
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data, c2_sd, gmm_c2_sd, p1, p2, 
   pca_data_projection)

Compare clusterings

Using Adjusted Rand Index:

# Pairwise Adjusted Rand Index between all clusterings; only the upper
# triangle (including the diagonal) is filled, the rest stays NA
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = as.factor(clusterings[[i]])
  for(j in (i):length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

# Heatmap of the rounded similarity matrix with values printed in the cells
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, cluster_sim)

Scatter plots

# Interactive 2D scatter of the genes in PC1/PC2 space, coloured by the
# column of plot_points named in `cat_var` (a string). Axis labels show the
# % of variance each PC explains.
# Uses the .data pronoun instead of the deprecated aes_string(); labs()
# keeps the legend title equal to the column name, as aes_string produced.
create_2D_plot = function(cat_var){
 ggplotly(plot_points %>% ggplot(aes(x=PC1, y=PC2, color=.data[[cat_var]])) + 
          geom_point() + theme_minimal() + labs(color=cat_var) + 
          xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
          ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
# Interactive 3D scatter (PC1-PC3) coloured by the column of plot_points
# named in `cat_var` (a string), with the % of explained variance in each
# axis title
create_3D_plot = function(cat_var){
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>% 
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))  
}

# One data frame with the first three PCs plus every clustering as a factor,
# used by the plotting helpers above.
# NOTE(review): cc_l1 / cc_l2 (consensus clustering) are not created in the
# code visible here — presumably produced by the consensus clustering step
# whose plots live in the clustering_genes_03_20 folder; confirm they exist
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                           km_clust = as.factor(clusterings[['km']]),
                hc_clust = as.factor(clusterings[['hc']]),       cc_l1_clust = as.factor(clusterings[['cc_l1']]),
                cc_clust = as.factor(clusterings[['cc_l2']]),    ica_clust = as.factor(clusterings[['ICA_min']]),
                n_ica_clust = as.factor(rowSums(ICA_clusters)),  gmm_clust = as.factor(clusterings[['GMM']]),
                gmm_clust2 = as.factor(clusterings[['GMM_2']]),  gmm_clust3 = as.factor(clusterings[['GMM_3']]),
                wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]),
                manual2_clust=as.factor(clusterings[['Manual2']]))

2D plots of clusterings

  • Simple methods seem to only partition the space in buckets using information from the first component

  • All clusterings except K-means manage to separate the two clouds at least partially, but none of them does a good job

  • WGCNA creates clusters inverted between clouds

  • Manual classification + GMM by standard deviation clusters make sense

# One interactive 2D plot per clustering method
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('gmm_clust2')
create_2D_plot('gmm_clust3')
create_2D_plot('wgcna_clust')
create_2D_plot('manual2_clust')

3D plots

# 3D versions for a subset of the clusterings
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')
create_3D_plot('manual2_clust')